{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "400000\n"
     ]
    }
   ],
   "source": [
    "import tldextract\n",
    "import pandas as pd\n",
    "import re\n",
    "from collections import Counter\n",
    "import requests\n",
    "\n",
    "def get_alexa():\n",
    "    \"\"\"\n",
    "    提取alexa 域名 全部为正面域名 结构为 [[域名1, 目标值], [域名2, 目标值]..........]\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    alexa_date = pd.read_csv(\"./top-1m.csv\").iloc[0:200000, :]\n",
    "    return [(\"benign\", tldextract.extract(row[\"num1\"]).domain) for index, row in alexa_date.iterrows()]\n",
    "\n",
    "def get_360_DGA():\n",
    "    \"\"\"\n",
    "    提取360dga域名 结构同上\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    f = open(\"./360_dga.txt\", \"r\", encoding=\"utf-8\").readlines()[0: 200000]\n",
    "    # ls = [re.sub('\\t+', ' ', i).split(' ')[0] for i in f]\n",
    "    # d = sorted(Counter(ls).items(), key=lambda x: x[1], reverse=True)[0: 100]\n",
    "    # print(d)\n",
    "    return [(re.sub('\\t+', ' ', i).split(' ')[0], tldextract.extract(re.sub('\\t+', ' ', i).split(' ')[1]).domain) for i in f]\n",
    "\n",
    "\n",
    "\n",
    "# print(len(get_360_DGA()))\n",
    "def get_zeus_dga():\n",
    "    \"\"\"\n",
    "    提取zeus域名\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    f = open(\"zeus_dga_domains.txt\", \"r\", encoding=\"utf-8\").read()\n",
    "    return f.split(\".\")\n",
    "\n",
    "\n",
    "def get_data():\n",
    "    \"\"\"\n",
    "    拼接返回全部数据 作为数据集\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    return get_alexa() + get_360_DGA()\n",
    "date_set = get_data()\n",
    "print(len(date_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from keras.preprocessing import sequence\n",
    "from keras.models import Sequential\n",
    "from keras.layers.core import Dense, Dropout, Activation\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.recurrent import LSTM\n",
    "import sklearn\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# date_set[0: 10]\n",
    "features = [i[1] for i in date_set]   #提取域名\n",
    "label = [i[0] for i in date_set]      #提取标签\n",
    "valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(features)))} #构造检索字典\n",
    "y = [0 if x == 'benign' else 1 for x in label]   #目标值修改为0或1\n",
    "max_features = len(valid_chars) + 1  #？？？\n",
    "maxlen = np.max([len(x) for x in features])   #获得输入特征的最大长度\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 24 20 20 24  6\n",
      "  17]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 19 20 26 25 26 37\n",
      "  17]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 18 30 16 17 37 20 20\n",
      "  28]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 37 30  2 21\n",
      "  26]\n",
      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0\n",
      "   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 15  2 28  2 22 17 21  2\n",
      "  30]] <class 'numpy.ndarray'>\n",
      "[0 0 0 0 0] <class 'numpy.ndarray'>\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "def process_features(valid_chars, features, maxlen):\n",
    "    \"\"\"将特征替换为 检索字典的值 并根据最长特征长度 构造每个特征 无值填充为0\"\"\"\n",
    "    X = [[valid_chars[y] for y in x] for x in features]\n",
    "    X = sequence.pad_sequences(X, maxlen=maxlen)\n",
    "    X = np.array(X)\n",
    "    return X\n",
    "X = process_features(valid_chars, features, maxlen)\n",
    "y = np.array(y)\n",
    "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)   #训练集测试集切割\n",
    "\n",
    "# print(len(X[0]), X[0])\n",
    "# print(len(X[1]), X[1])\n",
    "\n",
    "# print(label[0: 10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 320000 samples, validate on 80000 samples\n",
      "Epoch 1/2\n",
      "320000/320000 [==============================] - 561s 2ms/step - loss: 0.1296 - val_loss: 0.0868\n",
      "Epoch 2/2\n",
      "320000/320000 [==============================] - 549s 2ms/step - loss: 0.0771 - val_loss: 0.0693\n",
      "80000/80000 [==============================] - 45s 557us/step\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "cannot unpack non-iterable numpy.float64 object",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-16-9e4f762f1f19>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     43\u001b[0m     \u001b[0mshow_train_history\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhistory\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'loss'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'val_loss'\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# 训练集误差率与验证集误差率 折线图\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     44\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 45\u001b[1;33m \u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-16-9e4f762f1f19>\u001b[0m in \u001b[0;36mrun\u001b[1;34m()\u001b[0m\n\u001b[0;32m     35\u001b[0m     \u001b[0mhistory\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m128\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m   \u001b[1;31m#训练\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     36\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 37\u001b[1;33m     \u001b[0mscore\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0macc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m128\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     38\u001b[0m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Test score:'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscore\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     39\u001b[0m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Test accuracy:'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0macc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: cannot unpack non-iterable numpy.float64 object"
     ]
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "def show_train_history(train_history,train, velidation):\n",
    "    \"\"\"\n",
    "    可视化训练过程 对比\n",
    "    :param train_history:\n",
    "    :param train:\n",
    "    :param velidation:\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    plt.plot(train_history.history[train])\n",
    "    plt.plot(train_history.history[velidation])\n",
    "    plt.title(\"Train History\")   #标题\n",
    "    plt.xlabel('Epoch')    #x轴标题\n",
    "    plt.ylabel(train)  #y轴标题\n",
    "    plt.legend(['train', 'test'], loc='upper left')  #图例 左上角\n",
    "    plt.show()\n",
    "\n",
    "def build_model(max_features, maxlen):\n",
    "    \"\"\"Build LSTM model\"\"\"\n",
    "    model = Sequential()\n",
    "    model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))\n",
    "    model.add(LSTM(128))\n",
    "    model.add(Dropout(0.5))\n",
    "    model.add(Dense(1))\n",
    "    model.add(Activation('sigmoid'))\n",
    "    model.summary()\n",
    "    model.compile(loss='binary_crossentropy',\n",
    "                  optimizer='rmsprop')\n",
    "\n",
    "    return model\n",
    "def run():\n",
    "    model = build_model(max_features, maxlen) \n",
    "    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)   #训练集测试集切割\n",
    "\n",
    "    history = model.fit(x_train, y_train, epochs=2, batch_size=128, validation_data=(x_test, y_test))   #训练\n",
    "    \n",
    "    score, acc = model.evaluate(x_test, y_test, batch_size=128)\n",
    "    print('Test score:', score)\n",
    "    print('Test accuracy:', acc)\n",
    "    \n",
    "    \"\"\"可视化训练过程\"\"\"\n",
    "    show_train_history(history, 'acc', 'val_acc')  # 训练集准确率与验证集准确率 折线图\n",
    "    show_train_history(history, 'loss', 'val_loss')  # 训练集误差率与验证集误差率 折线图\n",
    "\n",
    "run()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
